function [final_episode]=simulation_exp(alpha,p1,p2,pi1,pi2,threshold,beta,min_episodes)
%SIMULATION_EXP Run one Q-learning experiment over two actions.
%
%   final_episode = simulation_exp(alpha,p1,p2,pi1,pi2,threshold,beta,min_episodes)
%   simulates a Q-learning algorithm that chooses between actions 1 and 2
%   in every episode t = 1, 2, ...
%
%   Inputs:
%     alpha        - learning rate used in the update
%                    Q(a) <- alpha*profit + (1-alpha)*Q(a).
%     p1, p2       - probability that the profit state is 1 (rather than 2)
%                    when action 1, respectively action 2, is played.
%     pi1, pi2     - profits of action 1 and action 2, indexed by the
%                    profit state (1 or 2). pi1(1) and pi2(1) are also the
%                    initial Q-values.
%     threshold    - the run stops only once the same action has been
%                    played on MORE than "threshold" consecutive episodes.
%     beta         - decay rate of exploration: in episode t the algorithm
%                    experiments with probability exp(-beta*t). Must be a
%                    non-negative scalar.
%     min_episodes - minimum number of episodes before the run may stop.
%
%   Output: final_episode, a 1 x 4 vector whose columns are
%     1: the action chosen in the last episode (1 or 2)
%     2: the profit obtained in the last episode
%     3: the index of the last episode
%     4: the number of consecutive episodes, up to and including the last
%        one, during which the same action was played
%
%   Uses binornd, randsample and datasample (Statistics and Machine
%   Learning Toolbox).

%A negative (or NaN) beta makes exp(-beta*t) an invalid probability, so the
%experimentation draw would be neither 0 nor 1 and no action would be
%selected. Fail early with an explicit message instead.
if ~isscalar(beta) || ~(beta >= 0)
    error('simulation_exp:invalidBeta', ...
        'beta must be a non-negative scalar so that exp(-beta*t) is a valid probability.');
end

%Initialize the Q-values with the state-1 profit of each action.
Q_n = [pi1(1) ; pi2(1)];

%Draw the experimentation decisions of the first min_episodes episodes.
epsilon = exp(-beta*(1:1:min_episodes));
initial_experiment = binornd(1,epsilon)';

%Draw the profit state associated with each action in each episode.
proba_state1 = [p1 , 1 - p1];
proba_state2 = [p2 , 1 - p2];
chunk = threshold + min_episodes;
profit_realization1 = randsample(1:2, chunk, true, proba_state1)';
profit_realization2 = randsample(1:2, chunk, true, proba_state2)';

%Initialize
w = 1;                %Number of blocks of min_episodes episodes started so far
since_same_strat = 0; %Length of the current streak of identical actions
previous_action = 0;  %No action played yet (never equal to 1 or 2)
t = 0;

while true  %Loop until the stopping condition below is met

        t = t+1;

        if t > min_episodes %Beyond min_episodes we need a new experimentation draw
            experiment = binornd(1,exp(-beta*t));
        else
            experiment = initial_experiment(t);
        end

        if experiment == 1 %Experiment: pick randomly between the two actions
            action = randi([1 2]);
        else %Exploit: pick an action with maximal Q-value, ties broken at random
            action = datasample(find(Q_n(:,1) == max(Q_n(:,1))), 1);
        end

        if min_episodes*w < t %Each time another block of min_episodes episodes starts,
            %draw threshold + min_episodes additional profit states so that
            %index t always exists.
            profit_realization1 = [profit_realization1; randsample(1:2, chunk, true, proba_state1)'];
            profit_realization2 = [profit_realization2; randsample(1:2, chunk, true, proba_state2)'];
            w = w+1;
        end

        %Realized profit of the chosen action in this episode.
        if action == 1
            last_profit = double(pi1(profit_realization1(t)));
        else
            last_profit = double(pi2(profit_realization2(t)));
        end

        %Update the Q-value of the action that was played.
        Q_n(action,1) = alpha * last_profit + (1-alpha) * Q_n(action,1);

        %Update the count of for how long the same action has been played.
        if action == previous_action
            since_same_strat = since_same_strat+1;
        else
            since_same_strat = 1;  %Reset the count if the action just changed
        end
        previous_action = action;

        if since_same_strat > threshold && t >= min_episodes
            break;  %Exit the while loop when the stopping condition is met
        end
end

%Record the final episode
final_episode = [action, last_profit, t, since_same_strat];